/*-------------------------------------------------------------------------
 *
 * regprefix.c
 *      Extract a common prefix, if any, from a compiled regex.
 *
 *
 * Portions Copyright (c) 2012-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1998, 1999 Henry Spencer
 *
 * IDENTIFICATION
 *      src/backend/regex/regprefix.c
 *
 *-------------------------------------------------------------------------
 */

#include "regex/regguts.h"


/*
 * forward declarations
 */
static int findprefix(struct cnfa *cnfa, struct colormap *cm,
           chr *string, size_t *slength);


/*
 * pg_regprefix - get common prefix for regular expression
 *
 * Returns one of:
 *    REG_NOMATCH: there is no common prefix of strings matching the regex
 *    REG_PREFIX: there is a common prefix of strings matching the regex
 *    REG_EXACT: all strings satisfying the regex must match the same string
 *    or a REG_XXX error code
 *
 * In the non-failure cases, *string is set to a malloc'd string containing
 * the common prefix or exact value, of length *slength (measured in chrs
 * not bytes!).
 *
 * This function does not analyze all complex cases (such as lookaround
 * constraints) exactly.  Therefore it is possible that some strings matching
 * the reported prefix or exact-match string do not satisfy the regex.  But
 * it should never be the case that a string satisfying the regex does not
 * match the reported prefix or exact-match string.
 */
int
pg_regprefix(regex_t *re,
             chr **string,
             size_t *slength)
{// #lizard forgives
    struct guts *g;
    struct cnfa *cnfa;
    int            st;

    /* sanity checks */
    if (string == NULL || slength == NULL)
        return REG_INVARG;
    *string = NULL;                /* initialize for failure cases */
    *slength = 0;
    if (re == NULL || re->re_magic != REMAGIC)
        return REG_INVARG;
    if (re->re_csize != sizeof(chr))
        return REG_MIXED;

    /* Initialize locale-dependent support */
    pg_set_regex_collation(re->re_collation);

    /* setup */
    g = (struct guts *) re->re_guts;
    if (g->info & REG_UIMPOSSIBLE)
        return REG_NOMATCH;

    /*
     * This implementation considers only the search NFA for the topmost regex
     * tree node.  Therefore, constraints such as backrefs are not fully
     * applied, which is allowed per the function's API spec.
     */
    assert(g->tree != NULL);
    cnfa = &g->tree->cnfa;

    /*
     * Since a correct NFA should never contain any exit-free loops, it should
     * not be possible for our traversal to return to a previously visited NFA
     * state.  Hence we need at most nstates chrs in the output string.
     */
    *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
    if (*string == NULL)
        return REG_ESPACE;

    /* do it */
    st = findprefix(cnfa, &g->cmap, *string, slength);

    assert(*slength <= cnfa->nstates);

    /* clean up */
    if (st != REG_PREFIX && st != REG_EXACT)
    {
        FREE(*string);
        *string = NULL;
        *slength = 0;
    }

    return st;
}

/*
 * findprefix - extract common prefix from cNFA
 *
 * Results are returned into the preallocated chr array string[], with
 * *slength (which must be preset to zero) incremented for each chr.
 */
static int                        /* regprefix return code */
findprefix(struct cnfa *cnfa,
           struct colormap *cm,
           chr *string,
           size_t *slength)
{// #lizard forgives
    int            st;
    int            nextst;
    color        thiscolor;
    chr            c;
    struct carc *ca;

    /*
     * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     * anchored left.  If we have both BOS and BOL, they must go to the same
     * next state.
     */
    st = cnfa->pre;
    nextst = -1;
    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    {
        if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
        {
            if (nextst == -1)
                nextst = ca->to;
            else if (nextst != ca->to)
                return REG_NOMATCH;
        }
        else
            return REG_NOMATCH;
    }
    if (nextst == -1)
        return REG_NOMATCH;

    /*
     * Scan through successive states, stopping as soon as we find one with
     * more than one acceptable transition character (either multiple colors
     * on out-arcs, or a color with more than one member chr).
     *
     * We could find a state with multiple out-arcs that are all labeled with
     * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     * In that case we add the chr "c" to the output string but then exit the
     * loop with nextst == -1.  This leaves a little bit on the table: if the
     * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     * to the prefix.  But chasing multiple parallel state chains doesn't seem
     * worth the trouble.
     */
    do
    {
        st = nextst;
        nextst = -1;
        thiscolor = COLORLESS;
        for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
        {
            /* We can ignore BOS/BOL arcs */
            if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
                continue;
            /* ... but EOS/EOL arcs terminate the search, as do LACONs */
            if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
                ca->co >= cnfa->ncolors)
            {
                thiscolor = COLORLESS;
                break;
            }
            if (thiscolor == COLORLESS)
            {
                /* First plain outarc */
                thiscolor = ca->co;
                nextst = ca->to;
            }
            else if (thiscolor == ca->co)
            {
                /* Another plain outarc for same color */
                nextst = -1;
            }
            else
            {
                /* More than one plain outarc color terminates the search */
                thiscolor = COLORLESS;
                break;
            }
        }
        /* Done if we didn't find exactly one color on plain outarcs */
        if (thiscolor == COLORLESS)
            break;
        /* The color must be a singleton */
        if (cm->cd[thiscolor].nschrs != 1)
            break;
        /* Must not have any high-color-map entries */
        if (cm->cd[thiscolor].nuchrs != 0)
            break;

        /*
         * Identify the color's sole member chr and add it to the prefix
         * string.  In general the colormap data structure doesn't provide a
         * way to find color member chrs, except by trying GETCOLOR() on each
         * possible chr value, which won't do at all.  However, for the cases
         * we care about it should be sufficient to test the "firstchr" value,
         * that is the first chr ever added to the color.  There are cases
         * where this might no longer be a member of the color (so we do need
         * to test), but none of them are likely to arise for a character that
         * is a member of a common prefix.  If we do hit such a corner case,
         * we just fall out without adding anything to the prefix string.
         */
        c = cm->cd[thiscolor].firstchr;
        if (GETCOLOR(cm, c) != thiscolor)
            break;

        string[(*slength)++] = c;

        /* Advance to next state, but only if we have a unique next state */
    } while (nextst != -1);

    /*
     * If we ended at a state that only has EOS/EOL outarcs leading to the
     * "post" state, then we have an exact-match string.  Note this is true
     * even if the string is of zero length.
     */
    nextst = -1;
    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    {
        if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
        {
            if (nextst == -1)
                nextst = ca->to;
            else if (nextst != ca->to)
            {
                nextst = -1;
                break;
            }
        }
        else
        {
            nextst = -1;
            break;
        }
    }
    if (nextst == cnfa->post)
        return REG_EXACT;

    /*
     * Otherwise, if we were unable to identify any prefix characters, say
     * NOMATCH --- the pattern is anchored left, but doesn't specify any
     * particular first character.
     */
    if (*slength > 0)
        return REG_PREFIX;

    return REG_NOMATCH;
}
